Libraries

library(ggplot2)
library(plyr)
library(dplyr)
library(formattable)
library(caret)
library(knitr)
library(kableExtra)
library(plotly)
library(corrplot)
library(ggthemes)
library(doParallel)
library(parallel)
library(corrplot)
library(purrr)

Import Sample data

# Read the case-study data; the header row supplies the column names.
# NOTE(review): path is relative — assumes the working directory contains the
# CSV. Under R >= 4.0, character columns (Attrition, JobRole, MaritalStatus)
# are NOT converted to factors by default — confirm, since later plot() calls
# appear to rely on factor behavior.
CSData <- read.csv("CaseStudy2-data.csv", header = TRUE)

Analysis job roles and turnover

#Get the total number of employees in each job role
RoleTotal <- plyr::count(CSData$JobRole)
#Format columns
names(RoleTotal)[1] <- "JobRole"
names(RoleTotal)[2] <- "Total"
RoleTotal$JobRole <- as.character(RoleTotal$JobRole)
#Only employees with attrition
AttrY <- dplyr::filter(CSData, Attrition == "Yes")
#Count of the number of employees in each role who quit
AttrRole <- plyr::count(AttrY$JobRole)
#Format columns of AttrRole
names(AttrRole)[1] <- "JobRole"
names(AttrRole)[2] <- "Attr"
AttrRole$JobRole <- as.character(AttrRole$JobRole)
#Left-join RoleTotal with AttrRole. all.x = TRUE keeps job roles that had
#zero attrition; the previous default (inner) merge would silently drop them.
Role_Attr_Total <- merge(RoleTotal, AttrRole, all.x = TRUE)
#Roles with no attrition come back as NA from the left join — treat as 0
Role_Attr_Total$Attr[is.na(Role_Attr_Total$Attr)] <- 0
#Add a column with the calculated percentage of job-role turnover
Role_Attr_Total <- mutate(Role_Attr_Total, Attr_Percent = Attr / Total)
#Format the Attr_Percent column as a percentage
Role_Attr_Total$Attr_Percent <- percent(Role_Attr_Total$Attr_Percent)
#Arrange the data in descending order by attrition percentage
Role_Attr_Total <- Role_Attr_Total %>% arrange(desc(Attr_Percent))

kable(Role_Attr_Total) %>% kable_styling() %>% column_spec(4, bold = TRUE)
JobRole Total Attr Attr_Percent
Sales Representative 53 24 45.28%
Human Resources 27 6 22.22%
Laboratory Technician 153 30 19.61%
Research Scientist 172 32 18.60%
Sales Executive 200 33 16.50%
Healthcare Representative 76 8 10.53%
Manager 51 4 7.84%
Manufacturing Director 87 2 2.30%
Research Director 51 1 1.96%
#Plot of attrition in each job role (stacked bars, flipped for readability)
p <- ggplot(data = CSData, aes(JobRole, fill = (Attrition == "Yes")))
#The fill is FALSE/TRUE (stayed/left). The previous labels
#c("Total Employees", "Attrition") mislabeled the FALSE segment: in a stacked
#bar it is the employees who STAYED, not the role total.
p + geom_bar() + coord_flip() +
  labs(title = "Attrition in each Job Role", x = "Job Role", y = "Number of Employees") +
  scale_fill_hc(name = "Attrition", labels = c("Stayed", "Left")) + theme_linedraw()

Drill down into top three highest job roles with attrition

Sales Representative Attrition analysis

# Sales Rep attrition: subset all Sales Representatives, and those who left
SalesReps <- CSData %>% filter(JobRole == "Sales Representative")
AttrYSales <- AttrY %>% filter(JobRole == "Sales Representative")

# Average years at the company. Sales Reps that leave are on average at the
# company less than 2.5 years.
mean(SalesReps$YearsAtCompany)
## [1] 2.924528
mean(AttrYSales$YearsAtCompany)
## [1] 2.375
# NOTE(review): the plot() calls below produce boxplots only if Attrition and
# MaritalStatus are factors; under R >= 4.0 read.csv leaves them character and
# plot() would error — confirm the R version / stringsAsFactors setting.
# Most sales reps that stay with the company are 30 and over
plot(SalesReps$Attrition, SalesReps$Age, xlab = "Attrition", ylab = "Age", main = "Sales Rep Attrition and Age")

# Job satisfaction for sales reps with attrition is never above 3.0 and averages 2.5
plot(SalesReps$Attrition, SalesReps$JobSatisfaction, xlab = "Attrition", ylab = "Job Satisfaction", main = "Sales Rep Attrition and Job Satisfaction")

# Turnover and distance from home: sales reps who leave generally live farther
# away than the 5-mile average of sales reps who stay
plot(SalesReps$Attrition, SalesReps$DistanceFromHome, xlab = "Attrition", ylab = "Distance from home", main = "Sales Rep Attrition and Distance from Home")

# The reps that leave are disproportionately single
plot(SalesReps$Attrition, SalesReps$MaritalStatus, xlab = "Attrition", ylab = "Marriage Status", main = "Sales Rep Attrition and Marital Status")

Human Resources Attrition analysis

#Filter on Human Resources
  HR <- filter(CSData, JobRole == "Human Resources")

# HR rep turnover average age is less than 30
plot(HR$Attrition, HR$Age, xlab = "Attrition", ylab = "Age", main = "Human Resources Attrition and Age")

# HR reps that left the company lived much farther from work, on average 20 miles away
plot(HR$Attrition, HR$DistanceFromHome, xlab = "Attrition", ylab = "Distance From Home", main = "Human Resources Attrition and Distance from home")

#Job satisfaction of those that left averaged 2
plot(HR$Attrition, HR$JobSatisfaction, xlab = "Attrition", ylab = "Job Satisfaction", main = "Human Resources Attrition and Job Satisfaction")

# Total working years: those that stayed averaged 7 years; those that quit averaged 2.
# Fixed: ylab previously said "Job Satisfaction" for a Total Working Years plot,
# xlab was lowercase, and the title misspelled "Attrition".
plot(HR$Attrition, HR$TotalWorkingYears, xlab = "Attrition", ylab = "Total Working Years", main = "Human Resources Attrition and Total Working Years")

# Those that stay have been with the company an average of five years; those
# that leave, less than two. Fixed: "Human Rsources" typo in the title.
plot(HR$Attrition, HR$YearsAtCompany, xlab = "Attrition", ylab = "Years At Company", main = "Human Resources Attrition and Years at the Company")

Laboratory Technician Attrition analysis

 # Subset the data to Laboratory Technicians. The redundant CSData$ prefix
 # inside filter() is dropped for consistency with the other role subsets.
  LabTech <- filter(CSData, JobRole == "Laboratory Technician")

#Lab techs who quit had a much lower environment satisfaction than the average of 3
plot(LabTech$Attrition, LabTech$EnvironmentSatisfaction, xlab = "Attrition", ylab = "Environment Satisfaction", main = "Lab Tech Attrition and Environment Satisfaction")

#Lab tech attrition and age
plot(LabTech$Attrition, LabTech$Age, xlab = "Attrition", ylab = "Age", main = "Lab Tech Attrition and Age")

#Lab tech attrition and distance from home
plot(LabTech$Attrition, LabTech$DistanceFromHome, xlab = "Attrition", ylab = "Distance From Home", main = "Lab Tech Attrition and Distance from home")

#Lab tech attrition and total working years
plot(LabTech$Attrition, LabTech$TotalWorkingYears, xlab = "Attrition", ylab = "Total Working Years", main = "Lab Tech Attrition and Total working years")

# Single lab techs are significantly more likely to leave
plot(LabTech$Attrition, LabTech$MaritalStatus, xlab = "Attrition", ylab = "Marital Status", main = "Lab Tech Attrition and Marital Status")

Predictive model to identify employees that are likely to leave the company

Find correlation to Attrition

#Look at the proportion of each variable that influences attrition

#Step 1: drop columns that cannot help in finding attrition — identifiers
#(ID, EmployeeNumber) and constant columns (StandardHours, Over18, EmployeeCount)
CSData_AttrUseful <- CSData %>%
  select(-ID, -EmployeeNumber, -StandardHours, -Over18, -EmployeeCount)

#Draw a filled-proportion bar plot of attrition for one variable.
#  df: data frame containing the columns
#  x:  name (string) of the predictor column to plot
#  y:  name (string) of the fill column (the attrition flag)
#Uses the rlang .data pronoun in aes() instead of the deprecated aes_string().
AttrPlot <- function(df, x, y){
  ggplot(data = df, aes(x = .data[[x]], fill = .data[[y]])) +
    geom_bar(alpha = .9, position = "fill") +
    coord_flip() + labs(x = x, y = "Attrition") + theme_hc() + scale_fill_hc()
}

# The response variable used as the fill in every plot
yname <- "Attrition"
# Predictor names: every column except the last.
# NOTE(review): this assumes Attrition is the LAST column of CSData_AttrUseful —
# confirm; otherwise Attrition gets plotted against itself and one predictor is
# silently dropped. names(CSData_AttrUseful)[names(...) != yname] would be safer.
xname <- names(CSData_AttrUseful[-ncol(CSData_AttrUseful)])

# One proportion plot per predictor variable
lapply(xname, function(x) AttrPlot(df = CSData_AttrUseful, x = x, y = yname))
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

## 
## [[14]]

## 
## [[15]]

## 
## [[16]]

## 
## [[17]]

## 
## [[18]]

## 
## [[19]]

## 
## [[20]]

## 
## [[21]]

## 
## [[22]]

## 
## [[23]]

## 
## [[24]]

## 
## [[25]]

## 
## [[26]]

## 
## [[27]]

## 
## [[28]]

## 
## [[29]]

## 
## [[30]]

Test classification models (Naive Bayes and KNN) for best fit

#Remove columns that are not useful (identifiers and constant columns).
#Recomputed here so this section can run stand-alone; same result as above.
CSData_AttrUseful <- CSData %>% select(- c(ID,EmployeeNumber, StandardHours, Over18, EmployeeCount))

#Create training and test data (60/40 split, stratified on Attrition)
set.seed(8)
TrainObs <- createDataPartition(y = CSData_AttrUseful$Attrition, p = .60, list = FALSE)
#Create the training observations for Attrition
AttrTrain <- CSData_AttrUseful[TrainObs,]

#Create the test observations for Attrition
AttrTest <- CSData_AttrUseful[-TrainObs,]
#Set the training control method: repeated 25-fold CV. twoClassSummary plus
#classProbs = TRUE are required so caret can report ROC/Sens/Spec metrics.
trainMethod <- trainControl(method = "repeatedcv", number =  25, repeats = 5, summaryFunction = twoClassSummary, classProbs = TRUE)

#Check number of cores for parallel processing
parallel::detectCores() #4 cores detected on iMac used for study
## [1] 4
#Use all but one core so the machine stays responsive. Previously hard-coded
#to 3, which over- or under-subscribes on machines with a different core count.
workers <- makeCluster(max(1L, parallel::detectCores() - 1L))

#Set up the workers to run training
registerDoParallel(workers)
#NOTE(review): call stopCluster(workers) after the model fits to release the workers

Naive Bayes method

#Fit the Naive Bayes model. metric = "Spec" tunes for specificity, which
#relies on the twoClassSummary/classProbs settings in trainMethod.
fit.nb <- train(Attrition ~., data = AttrTrain, method = "nb", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)

Predict,Summary, and assessment of Naive Bayes model

#Predictions from the fitted Naive Bayes model on the held-out test set
pred.nb <- predict(fit.nb, AttrTest)

#Summary of the Naive Bayes predictions (predicted class counts)
summary(pred.nb)
##  No Yes 
## 214 134
#Confusion matrix comparing predictions against the true test labels
confusionMatrix(pred.nb, AttrTest$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  195  19
##        Yes  97  37
##                                          
##                Accuracy : 0.6667         
##                  95% CI : (0.6144, 0.716)
##     No Information Rate : 0.8391         
##     P-Value [Acc > NIR] : 1              
##                                          
##                   Kappa : 0.2102         
##                                          
##  Mcnemar's Test P-Value : 8.724e-13      
##                                          
##             Sensitivity : 0.6678         
##             Specificity : 0.6607         
##          Pos Pred Value : 0.9112         
##          Neg Pred Value : 0.2761         
##              Prevalence : 0.8391         
##          Detection Rate : 0.5603         
##    Detection Prevalence : 0.6149         
##       Balanced Accuracy : 0.6643         
##                                          
##        'Positive' Class : No             
## 

KNN method

#Fit the KNN classifier with the same tuning setup as the Naive Bayes model
fit.knn <- train(Attrition ~., data = AttrTrain, method = "knn", metric = "Spec", trControl = trainMethod, preProcess = c("center","scale"), tuneLength = 31)

Predict,Summary, and assessment of KNN model

#Predictions from the fitted KNN model on the held-out test set
#(the original comment said "Naive Bayes"; this is the KNN model)
pred.knn <- predict(fit.knn, AttrTest)

#Summary of the KNN predictions (predicted class counts)
summary(pred.knn)
##  No Yes 
## 333  15
#Confusion matrix comparing predictions against the true test labels
confusionMatrix(pred.knn, AttrTest$Attrition)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  No Yes
##        No  287  46
##        Yes   5  10
##                                           
##                Accuracy : 0.8534          
##                  95% CI : (0.8119, 0.8889)
##     No Information Rate : 0.8391          
##     P-Value [Acc > NIR] : 0.2588          
##                                           
##                   Kappa : 0.2293          
##                                           
##  Mcnemar's Test P-Value : 2.13e-08        
##                                           
##             Sensitivity : 0.9829          
##             Specificity : 0.1786          
##          Pos Pred Value : 0.8619          
##          Neg Pred Value : 0.6667          
##              Prevalence : 0.8391          
##          Detection Rate : 0.8247          
##    Detection Prevalence : 0.9569          
##       Balanced Accuracy : 0.5807          
##                                           
##        'Positive' Class : No              
## 

Classification Model fit conclusion

Based on the output of both models, the KNN model has high Accuracy and Sensitivity, but its Specificity is only 18%. The Naive Bayes model meets all the criteria required, with Accuracy, Sensitivity, and Specificity all being over 60%.

Predicting Monthly Income

The most significant variables that correlate with Monthly Income are Job Level (95%) and Total Working Years (78%).

#Draw a correlation heatmap of all numeric columns in df
#(non-numeric columns are ignored; rows with NA are dropped first)
correlator <- function(df){
  numeric_df <- df %>%
    keep(is.numeric) %>%
    tidyr::drop_na()
  corr_mat <- cor(numeric_df)
  corrplot(corr_mat, addCoef.col = "white", number.digits = 2,
           number.cex = .5, method = "square",
           order = "hclust",
           tl.srt = 45, tl.cex = .8)
}

#Render the correlation heatmap for all numeric variables in the cleaned data
correlator(CSData_AttrUseful)

Comparing Linear regression models to predict Monthly Income using a simple linear model and knn regression

# Create the training and test data for the Monthly Income models
set.seed(12)
# Stratify the 60/40 split on the regression target (MonthlyIncome).
# The original stratified on Attrition, which gives no guarantee that the
# income distribution is balanced between the train and test sets.
TrainObs <- createDataPartition(y = CSData_AttrUseful$MonthlyIncome, p = .60, list = FALSE)

#Create the training observations for Monthly Income
MITrain <- CSData_AttrUseful[TrainObs,]

#Create the test observations for Monthly Income
MITest <- CSData_AttrUseful[-TrainObs,]
# Set the training method for the regression models: repeated 25-fold CV
trainMethod2 <- trainControl(method = "repeatedcv", number =  25, repeats = 5)

Fit a simple linear regression model

# Fit a multiple linear regression predicting MonthlyIncome from all
# remaining variables
fit.lm <- train(MonthlyIncome ~., data = MITrain, method = "lm", trControl = trainMethod2)

# Print the fit to check the cross-validated RMSE of the linear model
fit.lm
## Linear Regression 
## 
## 522 samples
##  30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (25 fold, repeated 5 times) 
## Summary of sample sizes: 501, 501, 501, 500, 502, 501, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1088.389  0.9385737  853.8985
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Fit a knn regression model

# Fit the KNN regression model.
# KNN is distance-based, so the predictors must be centered and scaled —
# otherwise large-scale variables dominate the distance metric. This matches
# the preprocessing used for the KNN classifier above; the original fit
# omitted it, which is a fairness problem for the model comparison.
fit.knnreg <- train(MonthlyIncome ~., data = MITrain, method = "knn", trControl = trainMethod2, preProcess = c("center","scale"))

# Print the fit to check the cross-validated RMSE of the knn regression model
fit.knnreg
## k-Nearest Neighbors 
## 
## 522 samples
##  30 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (25 fold, repeated 5 times) 
## Summary of sample sizes: 501, 499, 501, 502, 500, 501, ... 
## Resampling results across tuning parameters:
## 
##   k  RMSE      Rsquared    MAE     
##   5  4924.749  0.04703560  3749.325
##   7  4756.036  0.04941159  3620.643
##   9  4691.593  0.04871913  3606.306
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 9.

Regression Model Fit Conclusion

The simple linear model had a lower RMSE than the knn model and a much higher R-squared. The linear regression model is a better fit than the knn model.